/*
* ---------------
*
* This file is derivative work.
*
* Copyright 2010-2011 Øyvind Berg (elacin [at] gmail.com)
*
* ---------------- Original notice: ----------------
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*
*/
package org.elacin.pdfextract.datasource.pdfbox;
import org.apache.fontbox.util.BoundingBox;
import org.apache.log4j.Logger;
import org.apache.log4j.MDC;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdfviewer.PageDrawer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMatrix;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.TextNormalize;
import org.apache.pdfbox.util.TextPosition;
import org.elacin.pdfextract.Constants;
import org.elacin.pdfextract.content.PhysicalText;
import org.elacin.pdfextract.datasource.DocumentContent;
import org.elacin.pdfextract.datasource.PageContent;
import org.elacin.pdfextract.datasource.graphics.DrawingSurface;
import org.elacin.pdfextract.datasource.graphics.DrawingSurfaceImpl;
import org.elacin.pdfextract.geom.MathUtils;
import org.elacin.pdfextract.geom.Rectangle;
import org.jetbrains.annotations.NotNull;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.io.IOException;
import java.util.*;
import java.util.List;
import static org.elacin.pdfextract.Constants.USE_EXISTING_WHITESPACE;
public class PDFBoxIntegration extends PageDrawer {
// ------------------------------ FIELDS ------------------------------
private static final Logger log = Logger.getLogger(PDFBoxIntegration.class);
@NotNull
private static final byte[] SPACE_BYTES = { (byte) 32 };
private static final boolean NO_DESCENDERS = true;
/**
* page state
*/
@NotNull
protected final DrawingSurface graphicsDrawer = new DrawingSurfaceImpl();
/* i couldnt find this information anywhere, so calculate it and cache it here */
@NotNull
protected Map<PDFont, Boolean> areFontsMonospaced = new HashMap<PDFont, Boolean>();
/* The normalizer is used to remove text ligatures/presentation forms and to correct
the direction of right to left text, such as Arabic and Hebrew. */
@NotNull
private final TextNormalize normalize = new TextNormalize("UTF-8");
@NotNull
private final List<ETextPosition> charactersForPage = new ArrayList<ETextPosition>();
@NotNull
private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String,
List<TextPosition>>();
private BasicStroke basicStroke;
private int currentPageNo;
/**
* pdfbox state
*/
/* used to filter out text which is written several times to create a bold effect */
private final PDDocument doc;
/**
* document state
*/
public DocumentContent docContent;
private final int endPage;
public Fonts fonts;
public float rotation;
private final int startPage;
// --------------------------- CONSTRUCTORS ---------------------------
public PDFBoxIntegration(final PDDocument doc, final int startPage, final int endPage)
throws IOException {
this.doc = doc;
this.startPage = startPage;
this.endPage = endPage;
}
// ------------------------ OVERRIDING METHODS ------------------------
@Override
public void drawImage(Image awtImage, AffineTransform at) {
final Shape currentClippingPath = getGraphicsState().getCurrentClippingPath();
graphicsDrawer.drawImage(awtImage, at, currentClippingPath);
}
@Override
public void drawPage(final Graphics g, final PDPage p, final Dimension pageDimension)
throws IOException {
super.drawPage(g, p, pageDimension);
}
@Override
public void fillPath(int windingRule) throws IOException {
Color currentColor = getGraphicsState().getNonStrokingColor().getJavaColor();
getLinePath().setWindingRule(windingRule);
final Shape currentClippingPath = getGraphicsState().getCurrentClippingPath();
graphicsDrawer.fill(getLinePath(), currentColor, currentClippingPath);
getLinePath().reset();
}
@NotNull
@Override
public Graphics2D getGraphics() {
throw new RuntimeException("PDFBoxSource does not have Graphics2D");
}
@Override
public BasicStroke getStroke() {
return basicStroke;
}
/**
* Old version
*/
public void processEncodedText(@NotNull byte[] string) throws IOException {
/*
* Note on variable names. There are three different units being used
* in this code. Character sizes are given in glyph units, text locations
* are initially given in text units, and we want to save the data in
* display units. The variable names should end with Text or Disp to
* represent if the values are in text or disp units (no glyph units are saved).
*/
final float fontSizeText = getGraphicsState().getTextState().getFontSize();
final float horizontalScalingText = getGraphicsState().getTextState()
.getHorizontalScalingPercent() / 100f;
// float verticalScalingText = horizontalScaling;//not sure if this is right but what else to
// do???
final float riseText = getGraphicsState().getTextState().getRise();
final float wordSpacingText = getGraphicsState().getTextState().getWordSpacing();
final float characterSpacingText = getGraphicsState().getTextState().getCharacterSpacing();
/*
* We won't know the actual number of characters until
* we process the byte data(could be two bytes each) but
* it won't ever be more than string.length*2(there are some cases
* were a single byte will result in two output characters "fi"
*/
final PDFont font = getGraphicsState().getTextState().getFont();
/*
* This will typically be 1000 but in the case of a type3 font this might be a different
* number
*/
final float glyphSpaceToTextSpaceFactor;
if (font instanceof PDType3Font) {
PDMatrix fontMatrix = font.getFontMatrix();
float fontMatrixXScaling = fontMatrix.getValue(0, 0);
glyphSpaceToTextSpaceFactor = 1.0f / fontMatrixXScaling;
} else {
glyphSpaceToTextSpaceFactor = /* 1.0f / */ 1000f;
}
float spaceWidthText = 0.0F;
try {
spaceWidthText = (font.getFontWidth(SPACE_BYTES, 0, 1) / glyphSpaceToTextSpaceFactor);
} catch (Throwable exception) {
log.warn(exception, exception);
}
if (spaceWidthText == 0.0F) {
spaceWidthText = (font.getAverageFontWidth() / glyphSpaceToTextSpaceFactor);
spaceWidthText *= .80f;
}
/* Convert textMatrix to display units */
final Matrix initialMatrix = new Matrix();
initialMatrix.setValue(0, 0, 1.0F);
initialMatrix.setValue(0, 1, 0.0F);
initialMatrix.setValue(0, 2, 0.0F);
initialMatrix.setValue(1, 0, 0.0F);
initialMatrix.setValue(1, 1, 1.0F);
initialMatrix.setValue(1, 2, 0.0F);
initialMatrix.setValue(2, 0, 0.0F);
initialMatrix.setValue(2, 1, riseText);
initialMatrix.setValue(2, 2, 1.0F);
final Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
final Matrix dispMatrix = initialMatrix.multiply(ctm);
Matrix textMatrixStDisp = getTextMatrix().multiply(dispMatrix);
final float xScaleDisp = textMatrixStDisp.getXScale();
final float yScaleDisp = textMatrixStDisp.getYScale();
final float spaceWidthDisp = spaceWidthText * xScaleDisp * fontSizeText;
final float wordSpacingDisp = wordSpacingText * xScaleDisp * fontSizeText;
float maxVerticalDisplacementText = 0.0F;
StringBuilder characterBuffer = new StringBuilder(string.length);
int codeLength = 1;
for (int i = 0; i < string.length; i += codeLength) {
// Decode the value to a Unicode character
codeLength = 1;
String c = font.encode(string, i, codeLength);
if ((c == null) && (i + 1 < string.length)) {
// maybe a multibyte encoding
codeLength++;
c = font.encode(string, i, codeLength);
}
c = inspectFontEncoding(c);
// todo, handle horizontal displacement
// get the width and height of this character in text units
float fontWidth = font.getFontWidth(string, i, codeLength) *0.95f;
if (fontWidth == 0.0f) {
fontWidth = spaceWidthDisp;
}
float characterHorizontalDisplacementText = (fontWidth / glyphSpaceToTextSpaceFactor);
maxVerticalDisplacementText = Math.max(maxVerticalDisplacementText,
font.getFontHeight(string, i, codeLength) / glyphSpaceToTextSpaceFactor);
if (maxVerticalDisplacementText <= 0.0f) {
maxVerticalDisplacementText = font.getFontBoundingBox().getHeight()
/ glyphSpaceToTextSpaceFactor;
}
/**
* PDF Spec - 5.5.2 Word Spacing
*
* Word spacing works the same was as character spacing, but applies
* only to the space character, code 32.
*
* Note: Word spacing is applied to every occurrence of the single-byte
* character code 32 in a string. This can occur when using a simple
* font or a composite font that defines code 32 as a single-byte code.
* It does not apply to occurrences of the byte value 32 in multiple-byte
* codes.
*
* RDD - My interpretation of this is that only character code 32's that
* encode to spaces should have word spacing applied. Cases have been
* observed where a font has a space character with a character code
* other than 32, and where word spacing (Tw) was used. In these cases,
* applying word spacing to either the non-32 space or to the character
* code 32 non-space resulted in errors consistent with this interpretation.
*/
float spacingText = characterSpacingText;
if ((string[i] == (byte) 0x20) && (codeLength == 1)) {
spacingText += wordSpacingText;
}
/*
* The text matrix gets updated after each glyph is placed. The updated
* version will have the X and Y coordinates for the next glyph.
*/
Matrix glyphMatrixStDisp = getTextMatrix().multiply(dispMatrix);
// The adjustment will always be zero. The adjustment as shown in the
// TJ operator will be handled separately.
float adjustment = 0.0F;
// TODO : tx should be set for horizontal text and ty for vertical text
// which seems to be specified in the font (not the direction in the matrix).
float tx = ((characterHorizontalDisplacementText - adjustment / glyphSpaceToTextSpaceFactor)
* fontSizeText) * horizontalScalingText;
Matrix td = new Matrix();
td.setValue(2, 0, tx);
float ty = 0.0F;
td.setValue(2, 1, ty);
setTextMatrix(td.multiply(getTextMatrix()));
Matrix glyphMatrixEndDisp = getTextMatrix().multiply(dispMatrix);
float sx = spacingText * horizontalScalingText;
Matrix sd = new Matrix();
sd.setValue(2, 0, sx);
float sy = 0.0F;
sd.setValue(2, 1, sy);
setTextMatrix(sd.multiply(getTextMatrix()));
float widthText = glyphMatrixEndDisp.getXPosition() - glyphMatrixStDisp.getXPosition();
characterBuffer.append(c);
Matrix textMatrixEndDisp = glyphMatrixEndDisp;
float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText
* yScaleDisp;
try {
final ETextPosition text = new ETextPosition(page, textMatrixStDisp, textMatrixEndDisp,
totalVerticalDisplacementDisp, new float[] { widthText },
spaceWidthDisp, characterBuffer.toString(), font,
fontSizeText,
(int) (fontSizeText * getTextMatrix().getXScale()),
wordSpacingDisp);
correctPosition(font, string, i, c, fontSizeText, glyphSpaceToTextSpaceFactor,
horizontalScalingText, codeLength, text);
processTextPosition(text);
} catch (Exception e) {
log.warn("LOG00570:Error adding '" + characterBuffer + "': " + e.getMessage());
}
textMatrixStDisp = getTextMatrix().multiply(dispMatrix);
characterBuffer.setLength(0);
}
}
/**
* This will process a TextPosition object and add the text to the list of characters on a page.
* <p/> This method also filter out unwanted textpositions .
*
* @param text The text to process.
*/
protected void processTextPosition(@NotNull TextPosition text_) {
ETextPosition text = (ETextPosition) text_;
if (text.getFontSize() == 0.0f) {
if (log.isDebugEnabled()) {
log.debug("LOG01100:ignoring text " + text.getCharacter() + " because fontSize is 0");
}
return;
}
if (!USE_EXISTING_WHITESPACE && "".equals(text.getCharacter().trim())) {
return;
}
if (text.getCharacter().length() == 0) {
if (log.isDebugEnabled()) {
log.debug("LOG01110:Tried to render no text. wtf?");
}
return;
}
// java.awt.Rectangle javapos = new java.awt.Rectangle((int) text.getPos().x,
// (int) text.getPos().y, (int) text.getPos().width,
// (int) text.getPos().height);
//
// if (!getGraphicsState().getCurrentClippingPath().intersects(javapos)) {
// if (log.isDebugEnabled()) {
// log.debug("LOG01090:Dropping text \"" + text.getCharacter() + "\" because it "
// + "was outside clipping path");
// }
//
// return;
// }
if (!textAlreadyRenderedAtSamePlace(text)) {
if (log.isDebugEnabled()) {
log.debug("LOG00770: ignoring text " + text.getCharacter()
+ " because it seems to be rendered two times");
}
return;
}
if (!MathUtils.isWithinPercent(text.getDir(), rotation, 1)) {
if (log.isDebugEnabled()) {
log.debug("LOG00560: ignoring textposition " + text.getCharacter() + "because it has "
+ "wrong rotation. TODO :)");
}
return;
}
/**
* In the wild, some PDF encoded documents put diacritics (accents on
* top of characters) into a separate Tj element. When displaying them
* graphically, the two chunks get overlayed. With text output though,
* we need to do the overlay. This code recombines the diacritic with
* its associated character if the two are consecutive.
*/
if (charactersForPage.isEmpty()) {
charactersForPage.add(text);
} else {
/**
* test if we overlap the previous entry. Note that we are making an assumption that we
* need to only look back one TextPosition to find what we are overlapping.
* This may not always be true.
*/
TextPosition previousTextPosition = charactersForPage.get(charactersForPage.size() - 1);
if (text.isDiacritic() && previousTextPosition.contains(text)) {
previousTextPosition.mergeDiacritic(text, normalize);
}
/**
* If the previous TextPosition was the diacritic, merge it into this one and remove it
* from the list.
*/
else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
text.mergeDiacritic(previousTextPosition, normalize);
charactersForPage.remove(charactersForPage.size() - 1);
charactersForPage.add(text);
} else {
charactersForPage.add(text);
}
}
}
@Override
public void setStroke(final BasicStroke newStroke) {
basicStroke = newStroke;
}
@Override
public void strokePath() throws IOException {
Color currentColor = getGraphicsState().getStrokingColor().getJavaColor();
final Shape currentClippingPath = getGraphicsState().getCurrentClippingPath();
graphicsDrawer.strokePath(getLinePath(), currentColor, currentClippingPath);
getLinePath().reset();
}
// -------------------------- PUBLIC METHODS --------------------------
public DocumentContent getContents() {
return docContent;
}
public void processDocument() throws IOException {
resetEngine();
try {
if (doc.isEncrypted()) {
doc.decrypt("");
}
} catch (Exception e) {
throw new RuntimeException("Could not decrypt document", e);
}
currentPageNo = 0;
docContent = new DocumentContent();
fonts = new Fonts();
for (final PDPage nextPage : (List<PDPage>) doc.getDocumentCatalog().getAllPages()) {
PDStream contentStream = nextPage.getContents();
currentPageNo++;
if (contentStream != null) {
COSStream contents = contentStream.getStream();
processPage(nextPage, contents);
}
}
docContent.setStyles(fonts.styles.values());
}
// -------------------------- OTHER METHODS --------------------------
private void correctPosition(@NotNull final PDFont fontObj, final byte[] string, final int i,
@NotNull final String c, final float fontSizeText,
final float glyphSpaceToTextSpaceFactor, float horizontalScalingText,
final int codeLength, @NotNull final ETextPosition text)
throws IOException {
/**
* Provide precise positioning of glyphs.
*
* There are several problems right which needs to be worked around:
*
* 1. Sometimes the PDF will make room for a glyph which belongs to a font with
* one or more very tall glyphs by jumping up on the page before drawing.
* Since most glyphs are (much) shorter than the tallest one, we need to make
* up for that by adjusting the Y coordinate back down. The distance which
* is jumped up is embedded in the PDF files, so there is no other way to go
* about this.
*
* 'beforeRoomForGlyph' is the position we were at before the jump back.
* Then we need to add spaceOverChar which is my estimate of where the glyph
* should begin. the result is kept in 'startY'
*
* 2. The default height we get might also be too big, so recalculate that based
* on character bounding
*
*/
final BoundingBox character = fontObj.getCharacterBoundingBox(string, i, codeLength);
PDRectangle fontBB = null;
try {
fontBB = fontObj.getFontBoundingBox();
} catch (RuntimeException e) {
// ignore, this is frequently not implemented
}
final Rectangle pos = text.getPos();
float adjust = (fontSizeText * horizontalScalingText) / glyphSpaceToTextSpaceFactor;
adjust *= getTextMatrix().getXScale();
final Rectangle newPos;
if ((character != null) && (fontBB != null) && (character.getHeight() > 0.0f)
&& (fontBB.getHeight() > 0.0f)) {
/* remove the upper and lower bounds filtered away by character */
final float spaceUnderChar = Math.min(fontBB.getLowerLeftY(), character.getLowerLeftY());
final float spaceOverChar = fontBB.getUpperRightY() - character.getUpperRightY();
final float fontHeight = fontBB.getHeight();
/* calculate the upper left corner of the rendered glyph */
float yStart = pos.endY - adjust * fontHeight;
yStart += adjust * spaceOverChar;
yStart -= adjust * spaceUnderChar;
yStart -= pos.height;
/* determine start X coordinate. */
final float x;
if (isMonoSpacedFont(fontObj)) {
x = pos.x;
} else {
// float leftOfText = text.getX() - (adjust * fontBB.getWidth());
//
// x = leftOfText + adjust * character.getLowerLeftX();
x = pos.x;
}
/*
* It was much easier to write the word segmentation code with full font width,
* so lets keep that. I havent seen this causing any problems
*/
float w = pos.width;
/*
* Line segmentation code was obviously much easier by not having any descenders which
* can even overlap into the following line. Math symbols need to stay full length
*/
final float characterHeight;
if (NO_DESCENDERS && (Character.getType(c.charAt(0)) != (int) Character.MATH_SYMBOL)) {
characterHeight = character.getUpperRightY();
} else {
characterHeight = character.getHeight();
}
float h = adjust * (characterHeight);
/* correct if the NO_DESCENDERS hack made this character have no height*/
if (NO_DESCENDERS && h < 0.1f){
h = pos.height;
}
newPos = new Rectangle(x, yStart, w, h);
} else {
/*
* here we have a lot less information, so keep most of what was calculated. Just offset
* the Y coordinate
*/
float h = pos.height;
float w = pos.width;
float startY = pos.y - h;// * 0.8f;
if (fontObj instanceof PDType3Font) {
/*
* type 3 fonts typically have almost no information
* try to mitigate the damage by keeping them small.
*/
h *= 0.5f;
startY += h; /* this is a _very_ quick and dirty hack */
}
newPos = new Rectangle(pos.x, startY, w, h);
}
if (log.isTraceEnabled()) {
log.trace("LOG00730:Text " + c + ", " + "pos from " + pos + " to " + newPos);
}
text.setBaseLine(pos.y);
text.setPos(newPos);
}
private void filterOutBadFonts(@NotNull List<ETextPosition> text) {
final Map<PDFont, Integer> badCharsForStyle = new HashMap<PDFont, Integer>();
final Map<PDFont, Integer> numCharsForStyle = new HashMap<PDFont, Integer>();
for (TextPosition tp : text) {
if (!badCharsForStyle.containsKey(tp.getFont())) {
badCharsForStyle.put(tp.getFont(), 0);
numCharsForStyle.put(tp.getFont(), 0);
}
char c = tp.getCharacter().charAt(0);
if (Character.isISOControl(c)) {
badCharsForStyle.put(tp.getFont(), badCharsForStyle.get(tp.getFont()) + 1);
}
numCharsForStyle.put(tp.getFont(), numCharsForStyle.get(tp.getFont()) + 1);
}
final Collection<PDFont> ignoredFonts = new ArrayList<PDFont>();
for (Map.Entry<PDFont, Integer> pdFontIntegerEntry : numCharsForStyle.entrySet()) {
int badChars = badCharsForStyle.get(pdFontIntegerEntry.getKey());
int totalChars = pdFontIntegerEntry.getValue();
if (badChars > totalChars * 0.10f) {
ignoredFonts.add(pdFontIntegerEntry.getKey());
log.warn("LOG01060:Ignoring all content using font "
+ pdFontIntegerEntry.getKey().getBaseFont() + " as it "
+ "seems to be missing UTF-8 conversion information");
}
}
for (Iterator<ETextPosition> iterator = text.iterator(); iterator.hasNext(); ) {
TextPosition tp = iterator.next();
if (ignoredFonts.contains(tp.getFont())) {
iterator.remove();
}
}
}
private void filterOutControlCodes(@NotNull List<ETextPosition> text) {
for (Iterator<ETextPosition> iterator = text.iterator(); iterator.hasNext(); ) {
TextPosition tp = iterator.next();
if (Character.isISOControl(tp.getCharacter().charAt(0))) {
if (log.isDebugEnabled()) {
log.debug("Removing character \"" + tp.getCharacter() + "\"");
}
iterator.remove();
}
}
}
private boolean textAlreadyRenderedAtSamePlace(@NotNull final TextPosition text) {
String c = text.getCharacter();
List<TextPosition> sameTextCharacters = characterListMapping.get(c);
if (sameTextCharacters == null) {
sameTextCharacters = new ArrayList<TextPosition>();
characterListMapping.put(c, sameTextCharacters);
return true;
}
/**
* RDD - Here we compute the value that represents the end of the rendered
* text. This value is used to determine whether subsequent text rendered
* on the same line overwrites the current text.
*
* We subtract any positive padding to handle cases where extreme amounts
* of padding are applied, then backed off (not sure why this is done, but there
* are cases where the padding is on the order of 10x the character width, and
* the TJ just backs up to compensate after each character). Also, we subtract
* an amount to allow for kerning (a percentage of the width of the last
* character).
*/
boolean suppressCharacter = false;
final float tolerance = (text.getWidth() / (float) c.length()) / 3.0f;
for (TextPosition other : sameTextCharacters) {
String otherChar = other.getCharacter();
float charX = other.getX();
float charY = other.getY();
if ((otherChar != null) && MathUtils.isWithinVariance(charX, text.getX(), tolerance)
&& MathUtils.isWithinVariance(charY, text.getY(), tolerance)) {
suppressCharacter = true;
}
}
boolean alreadyThere = true;
if (!suppressCharacter) {
sameTextCharacters.add(text);
alreadyThere = false;
}
return !alreadyThere;
}
private boolean isMonoSpacedFont(@NotNull PDFont fontObj) {
if (areFontsMonospaced.containsKey(fontObj)) {
return areFontsMonospaced.get(fontObj);
}
List<Float> widths = fontObj.getWidths();
boolean monospaced = true;
if (widths == null) {
monospaced = false;
} else {
final float firstWidth = widths.get(0);
for (int i = 1; i < widths.size(); i++) {
final float width = widths.get(i);
if (!MathUtils.isWithinPercent(width, firstWidth, 1.0f)) {
monospaced = false;
break;
}
}
}
if (monospaced) {
log.debug("LOG01080:Font " + fontObj.getBaseFont() + " is monospaced");
}
areFontsMonospaced.put(fontObj, monospaced);
return monospaced;
}
/**
* This will process the contents of a page.
*
* @param page The page to process.
* @param content The contents of the page.
* @throws IOException If there is an error processing the page.
*/
protected void processPage(@NotNull PDPage page, COSStream content) throws IOException {
if ((currentPageNo >= startPage) && (currentPageNo <= endPage)) {
/* show which page we are working on in the log */
MDC.put("page", currentPageNo);
charactersForPage.clear();
characterListMapping.clear();
pageSize = page.findCropBox().createDimension();
rotation = (float) page.findRotation();
/* this is used to 'draw' images on during pdf parsing */
graphicsDrawer.clearSurface();
setGraphicsState(null);
resetEngine();
processStream(page, page.findResources(), content);
filterOutBadFonts(charactersForPage);
/* filter out remaining definite bad characters */
filterOutControlCodes(charactersForPage);
List<PhysicalText> texts = new ArrayList<PhysicalText>(charactersForPage.size());
for (ETextPosition tp : charactersForPage) {
texts.add(tp.convertText(fonts));
}
final PDRectangle mediaBox = page.findMediaBox();
Rectangle dimensions = new Rectangle(mediaBox.getLowerLeftX(),
mediaBox.getLowerLeftY(), mediaBox.getWidth(),
mediaBox.getHeight());
PageContent thisPage = new PageContent(texts, graphicsDrawer.getGraphicContents(),
currentPageNo, dimensions);
docContent.addPage(thisPage);
MDC.remove("page");
}
}
@Override public void SHFill(final COSName ShadingName) throws IOException {
super.SHFill(ShadingName);
}
}